set.seed(1)
project_2_data = 
  read_csv("./data/project_2_data.csv", na = c("NA", "", ".")) |>
  janitor::clean_names()  |>
  mutate(status = ifelse(status == "Dead", 1, 0))
## Rows: 4024 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Race, Marital Status, T Stage, N Stage, 6th Stage, differentiate, ...
## dbl  (5): Age, Tumor Size, Regional Node Examined, Reginol Node Positive, Su...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dead <- project_2_data |>
  filter(status == 1)

alive <- project_2_data |>
  filter(status == 0) |>
  sample_n(nrow(dead))

project_2_numdata<-
  bind_rows(dead, alive) |>
  mutate(
    t_stage = case_when(
    t_stage == "T1" ~ 1,
    t_stage == "T2" ~ 2,
    t_stage == "T3" ~ 3,
    t_stage == "T4" ~ 4,
    TRUE ~ NA_real_),
    n_stage = case_when(
    n_stage == "N1" ~ 1,
    n_stage == "N2" ~ 2,
    n_stage == "N3" ~ 3,
    TRUE ~ NA_real_),
    x6th_stage_num = case_when(
    x6th_stage == "IIA" ~ 1,
    x6th_stage == "IIB" ~ 2,
    x6th_stage == "IIIA" ~ 3,
    x6th_stage == "IIIB" ~ 4,
    x6th_stage == "IIIC" ~ 5,
    TRUE ~ NA_real_),
    differentiate = case_when(
    differentiate == "Well differentiated" ~ 1,
    differentiate == "Moderately differentiated" ~ 2,
    differentiate == "Poorly differentiated" ~ 3,
    differentiate == "Undifferentiated" ~ 4,
    TRUE ~ NA_real_),
    grade = case_when(
    grade == "anaplastic; Grade IV" ~ 4,
    grade == "3" ~ 3,
    grade == "2" ~ 2,
    grade == "1" ~ 1,
    TRUE ~ NA_real_),
    a_stage_regional = ifelse(a_stage == "Regional", 1, 0),
    estrogen_status = ifelse(estrogen_status == "Positive", 1, 0),
    progesterone_status = ifelse(progesterone_status == "Positive", 1, 0)
    ) |>
  select(-a_stage)

surv_object <- Surv(time = project_2_numdata$survival_months, event = project_2_numdata$status)
variables <- names(project_2_numdata)[!names(project_2_numdata) %in% c("survival_months", "status")]
for (var in variables) {
  formula <- as.formula(paste("Surv(survival_months, status) ~", var))
  model <- coxph(formula, data = project_2_numdata)
  print(var)
  print(summary(model))
}
## [1] "age"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)   
## age 0.013313  1.013402 0.004454 2.989   0.0028 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.013     0.9868     1.005     1.022
## 
## Concordance= 0.536  (se = 0.012 )
## Likelihood ratio test= 9.05  on 1 df,   p=0.003
## Wald test            = 8.93  on 1 df,   p=0.003
## Score (logrank) test = 8.95  on 1 df,   p=0.003
## 
## [1] "race"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)   
## raceOther -0.5965    0.5508   0.2099 -2.842  0.00448 **
## raceWhite -0.3263    0.7216   0.1252 -2.607  0.00914 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## raceOther    0.5508      1.816    0.3650    0.8310
## raceWhite    0.7216      1.386    0.5646    0.9222
## 
## Concordance= 0.527  (se = 0.008 )
## Likelihood ratio test= 9.46  on 2 df,   p=0.009
## Wald test            = 9.81  on 2 df,   p=0.007
## Score (logrank) test = 9.92  on 2 df,   p=0.007
## 
## [1] "marital_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)  
## marital_statusMarried   -0.23876   0.78761  0.11794 -2.024   0.0429 *
## marital_statusSeparated  0.53873   1.71383  0.27908  1.930   0.0536 .
## marital_statusSingle    -0.10841   0.89726  0.14404 -0.753   0.4517  
## marital_statusWidowed    0.04877   1.04998  0.17757  0.275   0.7836  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## marital_statusMarried      0.7876     1.2697    0.6251    0.9924
## marital_statusSeparated    1.7138     0.5835    0.9918    2.9616
## marital_statusSingle       0.8973     1.1145    0.6766    1.1899
## marital_statusWidowed      1.0500     0.9524    0.7414    1.4871
## 
## Concordance= 0.537  (se = 0.011 )
## Likelihood ratio test= 12.55  on 4 df,   p=0.01
## Wald test            = 14  on 4 df,   p=0.007
## Score (logrank) test = 14.36  on 4 df,   p=0.006
## 
## [1] "t_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##            coef exp(coef) se(coef)     z Pr(>|z|)    
## t_stage 0.38440   1.46874  0.04713 8.156 3.48e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## t_stage     1.469     0.6809     1.339     1.611
## 
## Concordance= 0.582  (se = 0.011 )
## Likelihood ratio test= 62.94  on 1 df,   p=2e-15
## Wald test            = 66.51  on 1 df,   p=3e-16
## Score (logrank) test = 67.36  on 1 df,   p=2e-16
## 
## [1] "n_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##            coef exp(coef) se(coef)    z Pr(>|z|)    
## n_stage 0.61870   1.85652  0.04835 12.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## n_stage     1.857     0.5386     1.689     2.041
## 
## Concordance= 0.623  (se = 0.011 )
## Likelihood ratio test= 152  on 1 df,   p=<2e-16
## Wald test            = 163.7  on 1 df,   p=<2e-16
## Score (logrank) test = 175.6  on 1 df,   p=<2e-16
## 
## [1] "x6th_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                  coef exp(coef) se(coef)      z Pr(>|z|)    
## x6th_stageIIB  0.4390    1.5512   0.1336  3.287  0.00101 ** 
## x6th_stageIIIA 0.7669    2.1530   0.1260  6.088 1.15e-09 ***
## x6th_stageIIIB 1.4792    4.3893   0.2464  6.003 1.94e-09 ***
## x6th_stageIIIC 1.5199    4.5718   0.1274 11.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## x6th_stageIIB      1.551     0.6447     1.194     2.015
## x6th_stageIIIA     2.153     0.4645     1.682     2.756
## x6th_stageIIIB     4.389     0.2278     2.708     7.115
## x6th_stageIIIC     4.572     0.2187     3.562     5.868
## 
## Concordance= 0.637  (se = 0.012 )
## Likelihood ratio test= 171.5  on 4 df,   p=<2e-16
## Wald test            = 177.7  on 4 df,   p=<2e-16
## Score (logrank) test = 198.8  on 4 df,   p=<2e-16
## 
## [1] "differentiate"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                  coef exp(coef) se(coef)     z Pr(>|z|)    
## differentiate 0.44124   1.55463  0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## differentiate     1.555     0.6432      1.37     1.764
## 
## Concordance= 0.574  (se = 0.011 )
## Likelihood ratio test= 48.05  on 1 df,   p=4e-12
## Wald test            = 47.04  on 1 df,   p=7e-12
## Score (logrank) test = 47.37  on 1 df,   p=6e-12
## 
## [1] "grade"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##          coef exp(coef) se(coef)     z Pr(>|z|)    
## grade 0.44124   1.55463  0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## grade     1.555     0.6432      1.37     1.764
## 
## Concordance= 0.574  (se = 0.011 )
## Likelihood ratio test= 48.05  on 1 df,   p=4e-12
## Wald test            = 47.04  on 1 df,   p=7e-12
## Score (logrank) test = 47.37  on 1 df,   p=6e-12
## 
## [1] "tumor_size"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                coef exp(coef) se(coef)    z Pr(>|z|)    
## tumor_size 0.011933  1.012005 0.001574 7.58 3.46e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## tumor_size     1.012     0.9881     1.009     1.015
## 
## Concordance= 0.592  (se = 0.012 )
## Likelihood ratio test= 50.4  on 1 df,   p=1e-12
## Wald test            = 57.45  on 1 df,   p=3e-14
## Score (logrank) test = 58.25  on 1 df,   p=2e-14
## 
## [1] "estrogen_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                    coef exp(coef) se(coef)      z Pr(>|z|)    
## estrogen_status -0.9108    0.4022   0.1064 -8.562   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## estrogen_status    0.4022      2.486    0.3265    0.4954
## 
## Concordance= 0.565  (se = 0.008 )
## Likelihood ratio test= 60.02  on 1 df,   p=9e-15
## Wald test            = 73.31  on 1 df,   p=<2e-16
## Score (logrank) test = 78.48  on 1 df,   p=<2e-16
## 
## [1] "progesterone_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                         coef exp(coef) se(coef)      z Pr(>|z|)    
## progesterone_status -0.71314   0.49010  0.08583 -8.308   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## progesterone_status    0.4901       2.04    0.4142    0.5799
## 
## Concordance= 0.588  (se = 0.01 )
## Likelihood ratio test= 62.82  on 1 df,   p=2e-15
## Wald test            = 69.03  on 1 df,   p=<2e-16
## Score (logrank) test = 71.97  on 1 df,   p=<2e-16
## 
## [1] "regional_node_examined"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                            coef exp(coef) se(coef)     z Pr(>|z|)  
## regional_node_examined 0.010506  1.010561 0.004982 2.109    0.035 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                        exp(coef) exp(-coef) lower .95 upper .95
## regional_node_examined     1.011     0.9895     1.001      1.02
## 
## Concordance= 0.521  (se = 0.013 )
## Likelihood ratio test= 4.35  on 1 df,   p=0.04
## Wald test            = 4.45  on 1 df,   p=0.03
## Score (logrank) test = 4.44  on 1 df,   p=0.04
## 
## [1] "reginol_node_positive"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                           coef exp(coef) se(coef)     z Pr(>|z|)    
## reginol_node_positive 0.054926  1.056462 0.004627 11.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## reginol_node_positive     1.056     0.9466     1.047     1.066
## 
## Concordance= 0.627  (se = 0.012 )
## Likelihood ratio test= 106.3  on 1 df,   p=<2e-16
## Wald test            = 140.9  on 1 df,   p=<2e-16
## Score (logrank) test = 148  on 1 df,   p=<2e-16
## 
## [1] "x6th_stage_num"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                   coef exp(coef) se(coef)     z Pr(>|z|)    
## x6th_stage_num 0.37716   1.45813  0.02834 13.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## x6th_stage_num     1.458     0.6858     1.379     1.541
## 
## Concordance= 0.637  (se = 0.012 )
## Likelihood ratio test= 169.3  on 1 df,   p=<2e-16
## Wald test            = 177.1  on 1 df,   p=<2e-16
## Score (logrank) test = 187  on 1 df,   p=<2e-16
## 
## [1] "a_stage_regional"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## a_stage_regional -1.1046    0.3314   0.1747 -6.324 2.55e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## a_stage_regional    0.3314      3.018    0.2353    0.4666
## 
## Concordance= 0.522  (se = 0.005 )
## Likelihood ratio test= 29.49  on 1 df,   p=6e-08
## Wald test            = 39.99  on 1 df,   p=3e-10
## Score (logrank) test = 44.22  on 1 df,   p=3e-11

Based on the univariate Cox model results, the final model should include statistically significant and clinically relevant variables. These include age (p = 0.0028, HR = 1.013), race (p < 0.01, with disparities for Other and White), marital status (significant for Married, p = 0.0429, HR = 0.788), t_stage (p < 2e-16, HR = 1.469), n_stage (p < 2e-16, HR = 1.857), tumor size (p < 2e-14, HR = 1.012), differentiate (p < 7e-12, HR = 1.555), 6th_stage (significant for all categories, HR = 4.572 for IIIC), estrogen status (p < 2e-16, HR = 0.402), progesterone status (p < 2e-16, HR = 0.490), regional node positive (p < 2e-16, HR = 1.056), and a_stage (regional) (p < 3e-10, HR = 0.331). Variables like Single and Widowed marital status (p > 0.05) should be excluded. Further checks for multicollinearity and clinical relevance are necessary for final variable selection.

cor_matrix <- cor(project_2_numdata[, sapply(project_2_numdata, is.numeric)])
print(cor_matrix)
##                                age     t_stage     n_stage differentiate
## age                     1.00000000 -0.05011340 -0.01706295   -0.12321448
## t_stage                -0.05011340  1.00000000  0.30612098    0.15076838
## n_stage                -0.01706295  0.30612098  1.00000000    0.17991601
## differentiate          -0.12321448  0.15076838  0.17991601    1.00000000
## grade                  -0.12321448  0.15076838  0.17991601    1.00000000
## tumor_size             -0.07953275  0.78499755  0.30277440    0.14403058
## estrogen_status         0.12111658 -0.10672267 -0.12700846   -0.26247316
## progesterone_status     0.01233580 -0.07197461 -0.11108594   -0.21201525
## regional_node_examined -0.03141638  0.08844292  0.36808270    0.07651544
## reginol_node_positive   0.01628318  0.24426697  0.82874594    0.15163577
## survival_months        -0.08599693 -0.15971602 -0.24737291   -0.15052637
## status                  0.06976636  0.22525091  0.34556514    0.19151006
## x6th_stage_num         -0.01872612  0.57138048  0.90805991    0.19138867
## a_stage_regional        0.04777171 -0.23467556 -0.25575034   -0.04874199
##                              grade  tumor_size estrogen_status
## age                    -0.12321448 -0.07953275       0.1211166
## t_stage                 0.15076838  0.78499755      -0.1067227
## n_stage                 0.17991601  0.30277440      -0.1270085
## differentiate           1.00000000  0.14403058      -0.2624732
## grade                   1.00000000  0.14403058      -0.2624732
## tumor_size              0.14403058  1.00000000      -0.1230207
## estrogen_status        -0.26247316 -0.12302067       1.0000000
## progesterone_status    -0.21201525 -0.08438705       0.5850460
## regional_node_examined  0.07651544  0.11473470      -0.0414043
## reginol_node_positive   0.15163577  0.24878553      -0.1008736
## survival_months        -0.15052637 -0.16151178       0.2272740
## status                  0.19151006  0.19535827      -0.1757951
## x6th_stage_num          0.19138867  0.48127728      -0.1392492
## a_stage_regional       -0.04874199 -0.12796649       0.1279901
##                        progesterone_status regional_node_examined
## age                             0.01233580            -0.03141638
## t_stage                        -0.07197461             0.08844292
## n_stage                        -0.11108594             0.36808270
## differentiate                  -0.21201525             0.07651544
## grade                          -0.21201525             0.07651544
## tumor_size                     -0.08438705             0.11473470
## estrogen_status                 0.58504596            -0.04140430
## progesterone_status             1.00000000            -0.01390137
## regional_node_examined         -0.01390137             1.00000000
## reginol_node_positive          -0.07105526             0.49395643
## survival_months                 0.21200308            -0.03087185
## status                         -0.20001424             0.06199862
## x6th_stage_num                 -0.11070618             0.35066402
## a_stage_regional                0.07311096            -0.04237809
##                        reginol_node_positive survival_months      status
## age                               0.01628318     -0.08599693  0.06976636
## t_stage                           0.24426697     -0.15971602  0.22525091
## n_stage                           0.82874594     -0.24737291  0.34556514
## differentiate                     0.15163577     -0.15052637  0.19151006
## grade                             0.15163577     -0.15052637  0.19151006
## tumor_size                        0.24878553     -0.16151178  0.19535827
## estrogen_status                  -0.10087358      0.22727403 -0.17579507
## progesterone_status              -0.07105526      0.21200308 -0.20001424
## regional_node_examined            0.49395643     -0.03087185  0.06199862
## reginol_node_positive             1.00000000     -0.21263747  0.31134369
## survival_months                  -0.21263747      1.00000000 -0.58302039
## status                            0.31134369     -0.58302039  1.00000000
## x6th_stage_num                    0.78335778     -0.25969095  0.36006577
## a_stage_regional                 -0.19505294      0.13738966 -0.13123516
##                        x6th_stage_num a_stage_regional
## age                       -0.01872612       0.04777171
## t_stage                    0.57138048      -0.23467556
## n_stage                    0.90805991      -0.25575034
## differentiate              0.19138867      -0.04874199
## grade                      0.19138867      -0.04874199
## tumor_size                 0.48127728      -0.12796649
## estrogen_status           -0.13924922       0.12799010
## progesterone_status       -0.11070618       0.07311096
## regional_node_examined     0.35066402      -0.04237809
## reginol_node_positive      0.78335778      -0.19505294
## survival_months           -0.25969095       0.13738966
## status                     0.36006577      -0.13123516
## x6th_stage_num             1.00000000      -0.29133616
## a_stage_regional          -0.29133616       1.00000000
cortable<-project_2_numdata|>
  select(-race, -marital_status,-x6th_stage)
cortable$regional_node_examined
##    [1]  9  9 12 15 16 23 24  8  5 21  4  3 33  7 11  3 16  8 28 20 14 23 20 13
##   [25] 15 14 20 17 11 23  3 34  1 17 17  4 12 27 14 20 28 29 18  3  4 13 23 18
##   [49]  7 41 11 10 27 19  5 24  2 12 16 15 40  9 10 19 17  3 19 16  7 25  1  9
##   [73] 10 15  9 11 29 21 17 17  7 15 22 47 22 14  9  9 13 14 10 19 15  6  2 54
##   [97]  5 31 28 21 19  8 13 13  9 18 12 15 27 24  5  7 22 17 16 12  9 23 10  6
##  [121] 19  5 15 10 30 18 15 27 14  3 36 16  5  9  3 32 31 21 18 10  3 20 10 25
##  [145]  9 14  9 29 32 19 27  1 11 13 14 21 11 15 16 22  2 14 17  5 10 23 10  3
##  [169] 17 18 18 23  9  9  8 23 19 12  9 26 12 15 23  9 14 22 16 30  4  2 10 13
##  [193]  1 10  4 15  7  5 28 12 18 28 18 12  6 18 26  3 21 36 26 18 18 28 16 30
##  [217]  2  1 21 14 21  4 10 16 19 15 29 15 34 23 27 17 13 12  3 13 14 36 11 12
##  [241]  9  9  5 13 11 20 10 14 16 47 15 16 29 16  9  7 11  6 16 18 22 18  4 25
##  [265] 29  1 13 13  9 21 25 10  2  8 21  2 26 17  4  9 13 16 10 13  1  6  9  9
##  [289] 18 22  8 15  8  4  4 18 12 17 15 19 16 27 15 10 14 13 12 11  8 14 21 13
##  [313] 17  4 19 12 17  7 25 37  8 24 13 22 15  3 13  3  2 19 12 11 11  2 20 12
##  [337] 19 11 15 12 21 10 10 17  3 15 17 15 17 11  8 18 21 18 10 29  9 18 13 20
##  [361] 35 23 12  7 32 18 18  5 28 10 10 18 11  9  9  1 30  3  7  9 19 57  5 22
##  [385] 12 15 12  3  8 15 25 10  4 26 25 16 23 13 17  8 24  9 10  9 36 19 12  8
##  [409] 21  8 10 30 17 11 12 20 22 14  4 11  1  4 18 20 12 19 15 21 16 18 11 16
##  [433] 17  5 16 23  7  5 19  9 22 22  8 10 19 11 11  7 12  8 13 12 24 12 16 16
##  [457] 20 13  3 30 22 35 11 10  7 30 29 14 12 17  2 24 15 10  9 12  4 19 23 28
##  [481] 17 14 27 12 16 11 19 22 16  8  9 10 20 15  4 24 19 18 12  4  9 17 22  2
##  [505] 11  6  2 16 21 20 12 17 21  7 15 13  6  4  9 30  2 29 13  3  8  5 14  6
##  [529] 15  1 14 11 21 34 11 18 13  7 11 11  4 16 19 20 12 13 30 19 17 12 12 15
##  [553] 12  9 12  6  8  2 12 28 14 24 24  4 26  7 18  8 18 13 22  9 28 21 10 13
##  [577]  6 16 12  2 15 16 17 17 23 25 47 21 19 18 18  3 31 11  2 16 14 16 23 13
##  [601] 14  7 10  7 16 25 28 10 16 22 23 21 21 19  6  2  8  6 12  2  5 14 15  8
##  [625]  3 23 19 19 15  7  7 14 10 10 14 31 17 25  7 12 15 12  4 17 19 24 14 24
##  [649]  6 25  6 19 34 16  3  7 15 19  2 26  9 21  2  9 13 10 17 13  4 15  5 13
##  [673] 21  5 18  6 21 11 18 11  7  4  5  5 13 18 17  5  7 14 23 11 15 13 13 10
##  [697] 43 47 20 14  9 26  7 12 28  1  5 18 18 12 11 20 41 23 10 20 16 21 26 12
##  [721]  6 17 12 11 20 22 26 10  8 15 11  4 16 13 21 14 17  9 10  8  3 12 22 37
##  [745] 18  7 11 23 13 25 12 17 29  1 12  7  1 23  4 10 25 13  1  9  9 11 26  9
##  [769] 18  3  8 24  9 14 24 18  2 15 24  9 23 27 16 15 18 16 17 25 15 15 18 15
##  [793] 12  2 13 20  9 13 16 15 15  9 23 13  2 20 19 29 22  2 15  9 23  7 12  3
##  [817] 13  1 31 15 20 12 18 10 15  7 23 24 13  8 12 13 20  7 22  4 11 14 16 17
##  [841] 10 13 11 19 17 15 25 15 19  9 20 16 14  6 14 11 13 16  7 17 16 13  5 35
##  [865]  1 20 20  5  5 26 15 15 16 20 13 14  7 15 27 22 30  4 34  1 20 14 18 10
##  [889] 16  8 29  4  9  8  3 10 25 18 20  9 17 28 15 20  3 18  9 21 14  7  1 12
##  [913] 15  6 17 35 13 25 18  8 16 26  5  8  9 25 27 13  3 16 29  6 14  3 17 16
##  [937] 17 32  8 16  3  2 16 20 24 13 19 19 11  3 14 12  8  6  1 23  7  2  4 13
##  [961] 12  6 37 15 17  2 14 11  3 12 27  6 19  4 19 12 14 18 15 12 20 10 21  8
##  [985] 17 17  8 20 15 10  6 21  6  8 17 24  7 13 14 27 15 23 15 11  1 24 13 26
## [1009] 16 18  5 16 28 19 12 14 14 23 17 18 31 12 11  9 20 16 18 18  2 13  4 14
## [1033] 12 13  4  3  2  8 11  3  2  2  2 13 10  1 16 12 18  5 28 16 13 15  7 22
## [1057] 17 11 10 16 14  9  8 16 15  6 18 14 15 14  3 22 38  3  5 17  9 23 14 13
## [1081]  9 24  9  9  6 16 18 17 10 24  8 27  7 22 10 12 28 14  9  7 17 26 11 23
## [1105]  9 11 12 32 32  6 13 16  8 19 37  3 18 14 10 21 13 12 15  9  6  7  9 13
## [1129] 31 13 18  4  6 18 12  3 16 11  3 29 26 14  9  8 29 18 27 11 20 19  9 16
## [1153] 13  3  9  2 12 10  7 12 25 26 18 14 17 18  9  9 12 13 14 18 30 14  6 15
## [1177]  2 19 16 14  9 10  9  5  7 11  4 14 10  4  9 12 12  3 16 16 17 24  9 11
## [1201] 16 19 11 14  3  4  5  5 17 12 16 22 11 10  8 18 21 10  4 25  3 19 14 16
## [1225] 15 16 16 11 18 13 11 12
chart.Correlation(cortable, histogram=TRUE, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

We have to choice 1 between differentiate - grade t stage - tumor size n stage - regional_node_possitive - x6th_stage_num

muldata = project_2_numdata |>
  select(-differentiate, -x6th_stage_num)
  
cox_model <- coxph(Surv(survival_months, status) ~.,
                   data = muldata)

vifs <- vif(cox_model)  
## Warning in vif.default(cox_model): No intercept: vifs may not be sensible.
print(vifs)
##                             GVIF Df GVIF^(1/(2*Df))
## age                     1.116360  1        1.056580
## race                    1.122842  2        1.029389
## marital_status          1.299150  4        1.033255
## t_stage                 4.253731  1        2.062458
## n_stage                16.186662  1        4.023265
## x6th_stage             43.299742  4        1.601624
## grade                   1.132178  1        1.064038
## tumor_size              2.918845  1        1.708463
## estrogen_status         1.725718  1        1.313666
## progesterone_status     1.579248  1        1.256681
## regional_node_examined  1.772873  1        1.331493
## reginol_node_positive   4.389820  1        2.095190
## a_stage_regional        1.358703  1        1.165634

The VIF analysis reveals significant multicollinearity for n_stage (VIF = 10.94, GVIF^(1/(2Df)) =3.31), x6th_stage_num (VIF = 1.56, GVIF^(1/(2Df)) =1.25), reginol_node_positive (VIF = 4.13, GVIF^(1/(2Df)) =2.03), and t_stage (VIF = 3.83, GVIF^(1/(2Df)) =1.96), indicating redundancy. Tumor_size shows moderate multicollinearity (VIF = 2.62, GVIF^(1/(2Df)) =1.62), while other variables have acceptable VIFs near 1. Variables with high multicollinearity should be reconsidered for exclusion. also shows we need to remove. So I decided to remove t_stage, n_stage, reginol_node_positive,

final = muldata |>
  select(-t_stage, -n_stage, -reginol_node_positive)
  
cox_model <- coxph(Surv(survival_months, status) ~.,
                   data = final)
vifs <- vif(cox_model)
## Warning in vif.default(cox_model): No intercept: vifs may not be sensible.
print(vifs)
##                            GVIF Df GVIF^(1/(2*Df))
## age                    1.104634  1        1.051016
## race                   1.117617  2        1.028190
## marital_status         1.252430  4        1.028535
## x6th_stage             2.194858  4        1.103255
## grade                  1.124856  1        1.060592
## tumor_size             1.302777  1        1.141393
## estrogen_status        1.664705  1        1.290234
## progesterone_status    1.542471  1        1.241962
## regional_node_examined 1.260509  1        1.122724
## a_stage_regional       1.350791  1        1.162235
library(survival)
ph_test <- cox.zph(cox_model)
print(ph_test)
##                          chisq df       p
## age                     0.1735  1    0.68
## race                    0.7005  2    0.70
## marital_status          4.2550  4    0.37
## x6th_stage              7.5261  4    0.11
## grade                   0.0909  1    0.76
## tumor_size              0.1020  1    0.75
## estrogen_status        22.6787  1 1.9e-06
## progesterone_status    19.8507  1 8.4e-06
## regional_node_examined  0.1197  1    0.73
## a_stage_regional        0.6921  1    0.41
## GLOBAL                 49.9364 17 4.3e-05
plot(ph_test) 

The Cox model assumes that hazard ratios are constant over time. A non-significant p-value (p > 0.05) indicates that the PH assumption holds. As GLOBAL 50.520 14 5.0e-06, the model did not meet the assumption, as same as estrogen_status, progesterone_status, and a_stage_regional. We need further improve our model.

finalmodel <- coxph(Surv(survival_months, status) ~ age + race + marital_status +
                            grade + tumor_size + 
                            regional_node_examined + x6th_stage,
                            data = final)


ph_test <- cox.zph(finalmodel)
print(ph_test)
##                           chisq df    p
## age                    3.20e-01  1 0.57
## race                   5.65e-01  2 0.75
## marital_status         5.40e+00  4 0.25
## grade                  2.01e-01  1 0.65
## tumor_size             4.63e-04  1 0.98
## regional_node_examined 2.96e-02  1 0.86
## x6th_stage             6.12e+00  4 0.19
## GLOBAL                 1.31e+01 14 0.52
plot(ph_test) 

cox_model$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.518730e+05 1.552820e+05 0.000000e+00 2.219000e+03 0.000000e+00 6.938175e-01 
##          std 
## 1.137217e-02
finalmodel$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.397010e+05 1.674530e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.698179e-01 
##          std 
## 1.162855e-02

No Clear Trends: If the solid line remains flat (close to zero), it indicates that the PH assumption is satisfied for that variable. Upward/Downward Trends: A visible trend or deviation indicates that the proportional hazards assumption may be violated for the corresponding variable, suggesting time-dependent effects.

A C-index of 0.716 suggests that the model has good discriminatory power, meaning it can correctly rank the survival times for about 71.6% of the pairs.The standard error is 0.01075, indicating a narrow range of variability in the concordance estimate, suggesting robust performance. The reduction in the C-index (from 73.9% to 71.6%) indicates a trade-off between model complexity and performance.

dev_residuals <- residuals(cox_model, type = "deviance")
plot(dev_residuals, main = "Deviance Residuals", ylab = "Residuals", xlab = "Index")
abline(h = c(-2, 2), col = "red", lty = 2) 

surv_fit <- survfit(Surv(survival_months, status) ~ 1, data = final)

plot(surv_fit, xlab = "Time (months)", ylab = "Survival Probability", 
     main = "Survival Curve for the Final Model", col = "blue", lwd = 2)

grid()